1 package org.apache.lucene.analysis.ar;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import static org.apache.lucene.analysis.util.StemmerUtil.*;
22
23
24
25
26
27
28
29
30
31
32
33
34
35 public class ArabicStemmer {
36 public static final char ALEF = '\u0627';
37 public static final char BEH = '\u0628';
38 public static final char TEH_MARBUTA = '\u0629';
39 public static final char TEH = '\u062A';
40 public static final char FEH = '\u0641';
41 public static final char KAF = '\u0643';
42 public static final char LAM = '\u0644';
43 public static final char NOON = '\u0646';
44 public static final char HEH = '\u0647';
45 public static final char WAW = '\u0648';
46 public static final char YEH = '\u064A';
47
48 public static final char prefixes[][] = {
49 ("" + ALEF + LAM).toCharArray(),
50 ("" + WAW + ALEF + LAM).toCharArray(),
51 ("" + BEH + ALEF + LAM).toCharArray(),
52 ("" + KAF + ALEF + LAM).toCharArray(),
53 ("" + FEH + ALEF + LAM).toCharArray(),
54 ("" + LAM + LAM).toCharArray(),
55 ("" + WAW).toCharArray(),
56 };
57
58 public static final char suffixes[][] = {
59 ("" + HEH + ALEF).toCharArray(),
60 ("" + ALEF + NOON).toCharArray(),
61 ("" + ALEF + TEH).toCharArray(),
62 ("" + WAW + NOON).toCharArray(),
63 ("" + YEH + NOON).toCharArray(),
64 ("" + YEH + HEH).toCharArray(),
65 ("" + YEH + TEH_MARBUTA).toCharArray(),
66 ("" + HEH).toCharArray(),
67 ("" + TEH_MARBUTA).toCharArray(),
68 ("" + YEH).toCharArray(),
69 };
70
71
72
73
74
75
76
77
78 public int stem(char s[], int len) {
79 len = stemPrefix(s, len);
80 len = stemSuffix(s, len);
81
82 return len;
83 }
84
85
86
87
88
89
90
91 public int stemPrefix(char s[], int len) {
92 for (int i = 0; i < prefixes.length; i++)
93 if (startsWithCheckLength(s, len, prefixes[i]))
94 return deleteN(s, 0, len, prefixes[i].length);
95 return len;
96 }
97
98
99
100
101
102
103
104 public int stemSuffix(char s[], int len) {
105 for (int i = 0; i < suffixes.length; i++)
106 if (endsWithCheckLength(s, len, suffixes[i]))
107 len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
108 return len;
109 }
110
111
112
113
114
115
116
117
118 boolean startsWithCheckLength(char s[], int len, char prefix[]) {
119 if (prefix.length == 1 && len < 4) {
120 return false;
121 } else if (len < prefix.length + 2) {
122 return false;
123 } else {
124 for (int i = 0; i < prefix.length; i++)
125 if (s[i] != prefix[i])
126 return false;
127
128 return true;
129 }
130 }
131
132
133
134
135
136
137
138
139 boolean endsWithCheckLength(char s[], int len, char suffix[]) {
140 if (len < suffix.length + 2) {
141 return false;
142 } else {
143 for (int i = 0; i < suffix.length; i++)
144 if (s[len - suffix.length + i] != suffix[i])
145 return false;
146
147 return true;
148 }
149 }
150 }